Linear Regression

Problem: Predication of Weight

Author: Ayush Tickoo


In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, cross_validation
from sklearn.linear_model import LinearRegression
import warnings
import matplotlib.pyplot as plt
from matplotlib import style


df = pd.read_csv('Height.csv')


/home/maestro/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [76]:
df


Out[76]:
Unnamed: 0 Height(Inches) Weight(Pounds)
0 NaN 65.78 112.99
1 NaN 71.52 136.49
2 NaN 69.40 153.03
3 NaN 68.22 142.34
4 NaN 67.79 144.30
5 NaN 68.70 123.30
6 NaN 69.80 141.49
7 NaN 70.01 136.46
8 NaN 67.90 112.37
9 NaN 66.78 120.67
10 NaN 66.49 127.45
11 NaN 67.62 114.14
12 NaN 68.30 125.61
13 NaN 67.12 122.46
14 NaN 68.28 116.09
15 NaN 71.09 140.00
16 NaN 66.46 129.50
17 NaN 68.65 142.97
18 NaN 71.23 137.90
19 NaN 67.13 124.04
20 NaN 67.83 141.28
21 NaN 68.88 143.54
22 NaN 63.48 97.90
23 NaN 68.42 129.50
24 NaN 67.63 141.85
25 NaN 67.21 129.72
26 NaN 70.84 142.42
27 NaN 67.49 131.55
28 NaN 66.53 108.33
29 NaN 65.44 113.89
... ... ... ...
170 NaN 69.43 122.61
171 NaN 67.97 124.21
172 NaN 67.76 124.65
173 NaN 65.28 119.52
174 NaN 73.83 139.30
175 NaN 66.81 104.83
176 NaN 66.89 123.04
177 NaN 65.74 118.89
178 NaN 65.98 121.49
179 NaN 66.58 119.25
180 NaN 67.11 135.02
181 NaN 65.87 116.23
182 NaN 66.78 109.17
183 NaN 68.74 124.22
184 NaN 66.23 141.16
185 NaN 65.96 129.15
186 NaN 68.58 127.87
187 NaN 66.59 120.92
188 NaN 66.97 127.65
189 NaN 68.08 101.47
190 NaN 70.19 144.99
191 NaN 65.52 110.95
192 NaN 67.46 132.86
193 NaN 67.41 146.34
194 NaN 69.66 145.59
195 NaN 65.80 120.84
196 NaN 66.11 115.78
197 NaN 68.24 128.30
198 NaN 68.02 127.47
199 NaN 71.39 127.88

200 rows × 3 columns

We will refine our data set


In [77]:
df = df[['Height(Inches)','Weight(Pounds)']]

In [78]:
df


Out[78]:
Height(Inches) Weight(Pounds)
0 65.78 112.99
1 71.52 136.49
2 69.40 153.03
3 68.22 142.34
4 67.79 144.30
5 68.70 123.30
6 69.80 141.49
7 70.01 136.46
8 67.90 112.37
9 66.78 120.67
10 66.49 127.45
11 67.62 114.14
12 68.30 125.61
13 67.12 122.46
14 68.28 116.09
15 71.09 140.00
16 66.46 129.50
17 68.65 142.97
18 71.23 137.90
19 67.13 124.04
20 67.83 141.28
21 68.88 143.54
22 63.48 97.90
23 68.42 129.50
24 67.63 141.85
25 67.21 129.72
26 70.84 142.42
27 67.49 131.55
28 66.53 108.33
29 65.44 113.89
... ... ...
170 69.43 122.61
171 67.97 124.21
172 67.76 124.65
173 65.28 119.52
174 73.83 139.30
175 66.81 104.83
176 66.89 123.04
177 65.74 118.89
178 65.98 121.49
179 66.58 119.25
180 67.11 135.02
181 65.87 116.23
182 66.78 109.17
183 68.74 124.22
184 66.23 141.16
185 65.96 129.15
186 68.58 127.87
187 66.59 120.92
188 66.97 127.65
189 68.08 101.47
190 70.19 144.99
191 65.52 110.95
192 67.46 132.86
193 67.41 146.34
194 69.66 145.59
195 65.80 120.84
196 66.11 115.78
197 68.24 128.30
198 68.02 127.47
199 71.39 127.88

200 rows × 2 columns


In [116]:
X = np.array(df['Height(Inches)'])
y = np.array(df['Weight(Pounds)'])

#Shaping to tell that only one feature is there
X = X.reshape(-1, 1)

Crossvalidation parameters


In [117]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.4)

Using Classifier


In [87]:
cf = LinearRegression()

In [120]:
cf.fit(X_train, y_train)
accuracy = cf.score(X_test, y_test)
print('Accuracy: ',int(accuracy*100),'%')


Accuracy:  30 %

In [131]:
plt.scatter(X, y,  color = 'black')
plt.plot(X, cf.predict(X), color = 'red', linewidth=1)
plt.xlabel('Height(Inches)', color = 'blue')
plt.ylabel('Weight(Pounds)', color = 'blue')
plt.show()